Loading the libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(broom)
library(styler)
gapminder_data <- read.csv("/Users/yasemindilarasucu/Desktop/gapminder_data_ys/gapminder_clean.csv")
Filtering the Data by the year 1962
filtered_data_1962 <- gapminder_data %>%
filter(Year == 1962)
Creating a Scatter plot
ggplot(filtered_data_1962, aes(x = " CO2 emissions (metric tons per capital) ", y = gdpPercap)) +
geom_point() +
labs(
title = "Scatter Plot of CO2 Emissions vs. GDP per Capita (1962)",
x = "CO2 Emissions (metric tons per capita)",
y = "GDP per Capita"
)
## Warning: Removed 131 rows containing missing values (`geom_point()`).
Checking for the missing values in the dataset
summary(gapminder_data)
## X Country.Name Year
## Min. : 0.0 Length:2607 Min. :1962
## 1st Qu.: 651.5 Class :character 1st Qu.:1972
## Median :1303.0 Mode :character Median :1987
## Mean :1303.0 Mean :1985
## 3rd Qu.:1954.5 3rd Qu.:1997
## Max. :2606.0 Max. :2007
##
## Agriculture..value.added....of.GDP. CO2.emissions..metric.tons.per.capita.
## Min. : 0.000 Min. : 0.0017
## 1st Qu.: 6.745 1st Qu.: 0.4334
## Median :16.942 Median : 1.5999
## Mean :19.992 Mean : 4.1472
## 3rd Qu.:30.807 3rd Qu.: 5.6550
## Max. :94.846 Max. :82.7189
## NA's :1179 NA's :414
## Domestic.credit.provided.by.financial.sector....of.GDP.
## Min. :-73.66
## 1st Qu.: 21.71
## Median : 39.17
## Mean : 50.06
## 3rd Qu.: 64.57
## Max. :301.19
## NA's :864
## Electric.power.consumption..kWh.per.capita.
## Min. : 7.61
## 1st Qu.: 347.36
## Median : 1216.23
## Mean : 2750.88
## 3rd Qu.: 3735.53
## Max. :36852.54
## NA's :1238
## Energy.use..kg.of.oil.equivalent.per.capita.
## Min. : 9.72
## 1st Qu.: 522.66
## Median : 1034.63
## Mean : 2131.31
## 3rd Qu.: 2835.76
## Max. :36146.70
## NA's :1197
## Exports.of.goods.and.services....of.GDP.
## Min. : 0.1382
## 1st Qu.: 16.3829
## Median : 26.4771
## Mean : 32.1762
## 3rd Qu.: 41.3692
## Max. :214.7423
## NA's :798
## Fertility.rate..total..births.per.woman. GDP.growth..annual...
## Min. :0.852 Min. :-44.900
## 1st Qu.:2.331 1st Qu.: 1.486
## Median :4.052 Median : 3.812
## Mean :4.217 Mean : 3.997
## 3rd Qu.:6.097 3rd Qu.: 6.351
## Max. :8.838 Max. :149.973
## NA's :184 NA's :691
## Imports.of.goods.and.services....of.GDP. Industry..value.added....of.GDP.
## Min. : 0.0795 Min. : 3.481
## 1st Qu.: 20.4815 1st Qu.: 20.812
## Median : 30.1252 Median : 29.146
## Mean : 37.1610 Mean : 29.742
## 3rd Qu.: 47.6579 3rd Qu.: 36.274
## Max. :330.5633 Max. :210.607
## NA's :798 NA's :1189
## Inflation..GDP.deflator..annual... Life.expectancy.at.birth..total..years.
## Min. : -15.424 Min. :19.27
## 1st Qu.: 2.775 1st Qu.:53.96
## Median : 5.700 Median :65.19
## Mean : 22.732 Mean :62.46
## 3rd Qu.: 10.099 3rd Qu.:71.08
## Max. :4078.476 Max. :82.51
## NA's :706 NA's :189
## Population.density..people.per.sq..km.of.land.area.
## Min. : 0.102
## 1st Qu.: 17.443
## Median : 46.113
## Mean : 260.380
## 3rd Qu.: 119.047
## Max. :20601.550
## NA's :49
## Services..etc...value.added....of.GDP. pop continent
## Min. : 10.07 Min. :6.534e+04 Length:2607
## 1st Qu.: 40.45 1st Qu.:3.134e+06 Class :character
## Median : 49.37 Median :7.455e+06 Mode :character
## Mean : 50.62 Mean :3.331e+07
## 3rd Qu.: 60.96 3rd Qu.:1.987e+07
## Max. :100.00 Max. :1.319e+09
## NA's :1186 NA's :1323
## gdpPercap
## Min. : 347
## 1st Qu.: 1253
## Median : 4151
## Mean : 8046
## 3rd Qu.: 10994
## Max. :109348
## NA's :1323
summary(filtered_data_1962)
## X Country.Name Year
## Min. : 0 Length:259 Min. :1962
## 1st Qu.: 645 Class :character 1st Qu.:1962
## Median :1290 Mode :character Median :1962
## Mean :1294 Mean :1962
## 3rd Qu.:1945 3rd Qu.:1962
## Max. :2597 Max. :1962
##
## Agriculture..value.added....of.GDP. CO2.emissions..metric.tons.per.capita.
## Min. : 4.903 Min. : 0.00848
## 1st Qu.:33.994 1st Qu.: 0.20073
## Median :40.223 Median : 0.65171
## Mean :40.028 Mean : 2.25427
## 3rd Qu.:45.904 3rd Qu.: 1.94326
## Max. :94.846 Max. :42.63712
## NA's :210 NA's :64
## Domestic.credit.provided.by.financial.sector....of.GDP.
## Min. : -1.513
## 1st Qu.: 12.598
## Median : 21.096
## Mean : 26.853
## 3rd Qu.: 34.333
## Max. :111.050
## NA's :154
## Electric.power.consumption..kWh.per.capita.
## Min. : 111.8
## 1st Qu.:1336.9
## Median :2006.8
## Mean :2467.4
## 3rd Qu.:2837.9
## Max. :9391.0
## NA's :228
## Energy.use..kg.of.oil.equivalent.per.capita.
## Min. : 350.1
## 1st Qu.: 1567.7
## Median : 2081.0
## Mean : 2527.1
## 3rd Qu.: 2938.8
## Max. :10414.5
## NA's :228
## Exports.of.goods.and.services....of.GDP.
## Min. : 3.518
## 1st Qu.: 9.283
## Median : 15.948
## Mean : 21.706
## 3rd Qu.: 27.095
## Max. :138.181
## NA's :139
## Fertility.rate..total..births.per.woman. GDP.growth..annual...
## Min. :1.790 Min. :-19.685
## 1st Qu.:4.245 1st Qu.: 3.296
## Median :6.065 Median : 5.112
## Mean :5.480 Mean : 5.066
## 3rd Qu.:6.750 3rd Qu.: 6.700
## Max. :8.197 Max. : 24.521
## NA's :23 NA's :135
## Imports.of.goods.and.services....of.GDP. Industry..value.added....of.GDP.
## Min. : 2.908 Min. : 3.52
## 1st Qu.: 11.683 1st Qu.:14.14
## Median : 19.248 Median :20.40
## Mean : 24.223 Mean :21.90
## 3rd Qu.: 29.432 3rd Qu.:30.43
## Max. :148.588 Max. :41.72
## NA's :139 NA's :215
## Inflation..GDP.deflator..annual... Life.expectancy.at.birth..total..years.
## Min. : -7.9713 Min. :28.55
## 1st Qu.: 0.6961 1st Qu.:44.82
## Median : 2.3513 Median :54.29
## Mean : 5.3079 Mean :54.26
## 3rd Qu.: 4.0644 3rd Qu.:64.73
## Max. :178.6815 Max. :73.72
## NA's :148 NA's :23
## Population.density..people.per.sq..km.of.land.area.
## Min. : 0.102
## 1st Qu.: 10.832
## Median : 27.698
## Mean : 180.998
## 3rd Qu.: 87.879
## Max. :11521.000
## NA's :6
## Services..etc...value.added....of.GDP. pop continent
## Min. : 15.42 Min. : 65345 Length:259
## 1st Qu.: 35.03 1st Qu.: 1852686 Class :character
## Median : 39.56 Median : 4569171 Mode :character
## Mean : 41.44 Mean : 21536525
## 3rd Qu.: 44.72 3rd Qu.: 10812216
## Max. :100.00 Max. :665770000
## NA's :213 NA's :131
## gdpPercap
## Min. : 355.2
## 1st Qu.: 1056.2
## Median : 2510.7
## Mean : 4958.9
## 3rd Qu.: 6085.3
## Max. :95458.1
## NA's :131
Imputing the NA values for the columns of CO2 Emissions (metric tons per capita and GDP per Capita
filtered_data_1962_imputed <- filtered_data_1962 %>%
mutate(
"CO2 Emissions (metric tons per capita)" = ifelse(
is.na("CO2 Emissions (metric tons per capita)"),
mean("CO2 Emissions (metric tons per capita)", na.rm = TRUE),
"CO2 Emissions (metric tons per capita)"
),
gdpPercap = ifelse(
is.na(gdpPercap),
mean(gdpPercap, na.rm = TRUE),
gdpPercap
)
)
p <- ggplot(filtered_data_1962_imputed, aes(x = `CO2.emissions..metric.tons.per.capita.`, y = gdpPercap, color = continent)) +
geom_point() +
labs(
title = "Scatter Plot of CO2 Emissions vs. GDP per Capita (1962)",
x = "CO2 Emissions (metric tons per capita)",
y = "GDP per Capita"
)
#applying a theme
p + theme_classic()
## Warning: Removed 64 rows containing missing values (`geom_point()`).
correlation_result <- cor.test(filtered_data_1962_imputed$`CO2.emissions..metric.tons.per.capita.`,
filtered_data_1962_imputed$gdpPercap,
method = "pearson"
)
print(correlation_result)
##
## Pearson's product-moment correlation
##
## data: filtered_data_1962_imputed$CO2.emissions..metric.tons.per.capita. and filtered_data_1962_imputed$gdpPercap
## t = 14.441, df = 193, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6455064 0.7819824
## sample estimates:
## cor
## 0.7206543
# Finding the year with the strongest correlation
correlation_results <- lapply(unique(gapminder_data$Year), function(year) {
subset_data <- gapminder_data %>% filter(Year == year)
correlation_result <- cor.test(subset_data$`CO2.emissions..metric.tons.per.capita.`,
subset_data$gdpPercap,
method = "pearson"
)
return(data.frame(Year = year, correlation = correlation_result$estimate))
})
correlation_results_df <- do.call(rbind, correlation_results)
strongest_correlation_year <- correlation_results_df[which.max(correlation_results_df$correlation), "Year"]
# Filtering the data for the year of the strongest correlation
filtered_data_strongest_correlation <- gapminder_data %>% filter(Year == strongest_correlation_year)
# Creating a scatter-plot with ggplot2 with the filtered data from the previous step
gg_plot <- ggplot(filtered_data_strongest_correlation, aes(x = `CO2.emissions..metric.tons.per.capita.`, y = gdpPercap, size = pop, color = continent)) +
geom_point() +
labs(
title = "Interactive Scatter Plot with Plotly",
x = "CO2 Emissions (metric tons per capita)",
y = "GDP per Capita"
)
# Converting ggplot2 plot to plotly
plotly_plot <- ggplotly(gg_plot)
plotly_plot
# Checking the significancy with ANOVA test
anova_result <- aov(`Energy.use..kg.of.oil.equivalent.per.capita.` ~ continent, data = gapminder_data)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 5 8.124e+08 162482656 21.88 <2e-16 ***
## Residuals 1404 1.043e+10 7426183
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1197 observations deleted due to missingness
Commenting on the results that the analysis shows that there are significant differences in Energy use among the different continents.
Visualization:
# Interactive Boxplot
plot_ly(gapminder_data, x = ~continent, y = ~`Energy.use..kg.of.oil.equivalent.per.capita.`, type = "box") %>%
layout(
title = "Energy Use Across Continents",
xaxis = list(title = "Continent"),
yaxis = list(title = "Energy Use (kg of oil equivalent per capita)")
)
## Warning: Ignoring 1197 observations
# Checking unique levels of the 'continent' variable
unique(gapminder_data$continent)
## [1] "Asia" "Europe" "Africa" "" "Americas" "Oceania"
# Filter data for non-empty continents and years after 1990
filtered_data_after_1990_cleaned <- gapminder_data %>%
filter(Year > 1990, continent %in% c("Asia", "Europe", "Africa", "Americas", "Oceania"))
# Explicitly set factor levels for 'continent'
filtered_data_after_1990_cleaned$continent <- factor(filtered_data_after_1990_cleaned$continent, levels = c("Asia", "Europe"))
# Check unique levels of the 'continent' variable
unique(filtered_data_after_1990_cleaned$continent)
## [1] Asia Europe <NA>
## Levels: Asia Europe
# Using the t-test for to compare the means of "Import of goods and services (% of GDP)" between Europe and Asia after 1990
t_test_result <- t.test(`Imports.of.goods.and.services....of.GDP.` ~ continent, data = filtered_data_after_1990_cleaned)
t_test_result
##
## Welch Two Sample t-test
##
## data: Imports.of.goods.and.services....of.GDP. by continent
## t = 1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
## -2.321099 12.433240
## sample estimates:
## mean in group Asia mean in group Europe
## 46.84531 41.78924
#According the t-test results that there is no significant difference in the means of “Import of goods and services (% of GDP)” between Europe and Asia after 1990, the p-value is 0.1776, which is greater than the typical significance level of 0.05. Therefore, there is no enough evidence to reject the null hypothesis that true difference in means between Asia and Europe is equal to 0.
Visualization
# Interactive Bar Plot
plot_ly(filtered_data_after_1990_cleaned, x = ~continent, y = ~`Imports.of.goods.and.services....of.GDP.`, type = "bar") %>%
layout(
title = "Imports of Goods and Services (% of GDP) - Europe vs. Asia (After 1990)",
xaxis = list(title = "Continent"),
yaxis = list(title = "Imports (% of GDP)")
)
## Warning: Ignoring 304 observations
# Since there is no need for a statistical test for this question, instead let's calculate the Population Density for each country across all years and identify the one with the highest average.
# Calculate average 'Population Density' for each country
average_population_density <- gapminder_data %>%
group_by(Country.Name) %>%
summarize(avg_population_density = mean(`Population.density..people.per.sq..km.of.land.area.`, na.rm = TRUE)) %>%
arrange(desc(avg_population_density))
# Display the country with the highest average population density
head(average_population_density, 1)
## # A tibble: 1 × 2
## Country.Name avg_population_density
## <chr> <dbl>
## 1 Macao SAR, China 14732.
Visualization
# Interactive Bar Plot
plot_ly(average_population_density, x = ~Country.Name, y = ~avg_population_density, type = "bar") %>%
layout(
title = "Average Population Density Across All Years",
xaxis = list(title = "Country"),
yaxis = list(title = "Average Population Density")
)
## Warning: Ignoring 1 observations
# Since there is no need for statistical test for this one, let's calculate the difference in "Life expectancy in between 1962 and 2007 for each country and identifying the one with he greatest increase.
# Calculate the difference in life expectancy between 1962 and 2007 for each country
life_expectancy_difference <- gapminder_data %>%
filter(Year %in% c(1962, 2007)) %>%
group_by(Country.Name) %>%
summarize(difference = diff(`Life.expectancy.at.birth..total..years.`)) %>%
arrange(desc(difference))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Country.Name'. You can override using the
## `.groups` argument.
# Display the country with the greatest increase in life expectancy
head(life_expectancy_difference, 1)
## # A tibble: 1 × 2
## # Groups: Country.Name [1]
## Country.Name difference
## <chr> <dbl>
## 1 Maldives 36.9
Visualization
# Create an interactive scatter plot
plot_ly(
data = life_expectancy_difference, x = ~difference, y = ~Country.Name, type = "scatter", mode = "markers",
marker = list(color = ~difference, colorscale = "Viridis"),
text = ~ paste("Country: ", Country.Name, "<br>Life Expectancy Difference: ", round(difference, 2))
) %>%
layout(
title = "Difference in Life Expectancy (2007 - 1962) for Each Country",
xaxis = list(title = "Life Expectancy Difference"),
yaxis = list(title = "Country"),
hovermode = "closest"
)
## Warning: Ignoring 23 observations